Let’s explore how things work when the dependency between two variables is accurately characterized with the regression line.
set.seed(1)
n <- 100
E <- rnorm(n, mean=0, sd=1)
X <- rnorm(n, mean=0, sd=0.25)
Y <- 920 + 10*X + E # simple linear regression
simulated_data <- data.frame(Y=Y, X=X) # observed data
(out.lm <- lm(Y ~ X, data=simulated_data)) # add restrictions/assumptions
##
## Call:
## lm(formula = Y ~ X, data = simulated_data)
##
## Coefficients:
## (Intercept) X
## 920.109 9.996
ggplot(simulated_data, aes(x=X, y=Y)) + geom_point() + geom_smooth(formula = y~x, method=lm, se=FALSE)
ggplot(simulated_data, aes(x=X, y=Y)) + geom_point() + geom_smooth(formula = y~x, method=lm, se=TRUE)
n <- 100 # sample size
n.reps <- 1000 # num of replications
theoretial_slope <- set_names(1:n.reps) %>%
map(~ tibble(X=rnorm(n, mean=0, sd=0.25), E=rnorm(n, mean=0, sd=1)) %>% mutate(Y = 920 + 10*X + E) %>% summarize(slope=lm(Y~X)$coef[2]) %>% as.numeric()) %>% unlist()
my_kde_plot <- ggplot(data=data.frame(theoretial_slope=theoretial_slope), aes(x=theoretial_slope)) + geom_density(fill='gray', adjust=1.5)
confidence_interval <- ggplot_build(my_kde_plot)$data[[1]] %>%
filter(x > quantile(theoretial_slope, 0.025)) %>%
filter(x < quantile(theoretial_slope, 0.975)) %>%
# https://ggplot2.tidyverse.org/reference/geom_ribbon.html
geom_area(mapping=aes(x=x,y=y, color='95% Confidence Interval'),
fill='red', alpha=0.25)
my_kde_plot + confidence_interval + theme(legend.position="bottom") +
ggtitle(paste0('If true slope was 10 and n=', n, sep=''))
out.lm <- lm(income ~ age, data = incex)
confint(out.lm)
## 2.5 % 97.5 %
## (Intercept) 894.114059 967.17031
## age 8.334461 10.01613
95% confidence interval of predicted means
predict(out.lm, data.frame(age = 30), interval = 'confidence')
## fit lwr upr
## 1 1205.901 1191.682 1220.12
age.pre <- 18:65
inc.pre <- predict(out.lm, data.frame(age = age.pre), interval = 'confidence')
kable(head(inc.pre))
| fit | lwr | upr |
|---|---|---|
| 1095.798 | 1073.385 | 1118.210 |
| 1104.973 | 1083.304 | 1126.641 |
| 1114.148 | 1093.216 | 1135.080 |
| 1123.323 | 1103.120 | 1143.527 |
| 1132.499 | 1113.014 | 1151.983 |
| 1141.674 | 1122.899 | 1160.449 |
par(mar=c(4, 4, 0.5, 2)) # bottom, left, top, right
plot(income ~ age, data = incex, cex = .4)
abline(out.lm, col = 'blue', lwd = 2) # add linear reg.
lines(age.pre, inc.pre[, 2], col = 'red') # add lower limit
lines(age.pre, inc.pre[, 3], col = 'red') # add upper limit
geom_smooth(..., se=TRUE, level=0.95) (or stat_smooth(..., geom = "smooth", se=TRUE, level=0.95))ggplot(incex, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') +
geom_smooth(method='lm', formula= y~x, col="red",
se = TRUE, level = 0.95, fill = "pink") + theme_bw()
Draw confidence envelopes with a subset of the first 100 observations.
For linear regression, confidence intervals are affected by sample size \(N\), the deviation from the sample mean of X (i.e., \(X-\bar{X}\)), variance of \(X\), and the sample residual variance (variance error of the estimate/regression).
incex2 <- incex[1:100,]
ggplot(incex2, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() +
geom_smooth(method='lm', formula= y~x, col="red",
se = TRUE, level = 0.95, fill = "pink")
geom_ribbonout.lm2 <- lm(income ~ oexp, incex2)
inc.pre2 <- data.frame(predict(out.lm2, interval = 'confidence'))
incex3 <- cbind(incex2, inc.pre2) # combine two datasets
kable(head(incex3))
| sex | age | edu | occ | oexp | income | fit | lwr | upr |
|---|---|---|---|---|---|---|---|---|
| female | 62 | low | low | 35 | 953 | 1380.246 | 1307.878 | 1452.614 |
| male | 32 | high | high | 6 | 1224 | 1203.481 | 1133.052 | 1273.909 |
| male | 56 | med. | high | 36 | 1466 | 1386.341 | 1311.134 | 1461.549 |
| female | 63 | med. | med. | 38 | 1339 | 1398.532 | 1317.440 | 1479.624 |
| male | 20 | low | low | 3 | 1184 | 1185.195 | 1106.191 | 1264.198 |
| female | 38 | med. | med. | 12 | 1196 | 1240.053 | 1184.069 | 1296.037 |
ggplot(incex3, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() +
geom_smooth(method='lm', formula= y~x, se = FALSE) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.3, fill="#69b3a2")
geom_ribbon and geom_lineggplot(incex3, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() +
geom_line(aes(y=fit), col="red") +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.3, fill="#69b3a2")
ggplot(incex, aes(x=oexp, y=income, color=sex, fill=sex)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') +
geom_smooth(method='lm', formula= y~x, se = TRUE) + theme_bw()
ggplot(incex, aes(x=oexp, y=income, color=sex, fill=sex)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') +
geom_smooth(method='lm', formula= y~x, se = TRUE) +
facet_grid(~ sex) + theme_bw()
n.reps <- 1000 # num of replicates
bootstraps <- set_names(1:n.reps) %>%
map(~ geom_line(data=sample_n(simulated_data, size=n, replace=TRUE),
mapping=aes(x=X, y=Y),
stat="smooth",
formula='y~x', method='lm', se=FALSE, alpha=0.5))
bootstraps[[1]] <- ggplot(simulated_data, aes(x=X, y=Y)) + geom_point() + bootstraps[[1]]
Reduce("+", bootstraps)
bootstrap_slope <- set_names(1:n.reps) %>%
map(~ sample_n(simulated_data, size=n, replace=TRUE) %>%
summarize(slope=lm(Y~X)$coef[2]) %>% as.numeric()) %>% unlist()
my_histogram_plot <- ggplot(data=data.frame(bootstrap_slope=bootstrap_slope), aes(x=bootstrap_slope)) + geom_histogram(bins=20, fill='gray')
bootstrap_interval <- ggplot_build(my_histogram_plot)$data[[1]] %>%
filter(xmin > quantile(bootstrap_slope, 0.025)) %>%
filter(xmax < quantile(bootstrap_slope, 0.975)) %>%
geom_col(mapping=aes(y=y, x=(xmin+xmax)/2,
color='95% bootstrapped\nconfidence interval'),
fill='red', alpha=0.25)
my_histogram_plot + bootstrap_interval + theme(legend.position="bottom")
boot packageThere is a package boot with a function boot() that does the bootstrap for many situations. The function boot() requires three arguments: (1) the data from the original sample (here, incex); (2) a function to compute the statistics from the data where the first argument is the data and the second argument is the indices of the observations in the bootstrap sample; (3) the number of bootstrap replicates.
library(boot)
b.stat <- function(data, i)
{
b.dat <- data[i ,]
out.lm <- lm(income ~ oexp, b.dat)
predict(out.lm, data.frame(oexp=incex2$oexp))
}
incex2 <- incex[1:100,] # subset of the first 100 cases
b.out <- boot(incex2, b.stat, R = 1000) # R = num of replications
boot.ci(b.out, index = 1, type = 'perc') # 95% CI for the first observation
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = b.out, type = "perc", index = 1)
##
## Intervals :
## Level Percentile
## 95% (1311, 1456 )
## Calculations and Intervals on Original Scale
b.ci <- t(sapply(1:nrow(incex2), function(x) boot.ci(b.out, index = x, type = 'perc')$percent))[, 4:5]
dimnames(b.ci) <- list(rownames(incex2), c('lower', 'upper'))
kable(head(b.ci, 4))
| lower | upper |
|---|---|
| 1311.324 | 1456.113 |
| 1144.457 | 1268.871 |
| 1315.804 | 1464.962 |
| 1323.975 | 1481.654 |
incex4 <- cbind(incex2, b.ci) # combine two datasets
ggplot(incex4, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() +
geom_smooth(method='lm', formula= y~x, se = FALSE) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, fill="#69b3a2")
# fit loess
out.lss <- loess(income ~ oexp, data = incex, span = .2, degree = 1) # local LINEAR regression, degree = 1
# get predicted means
x.val <- seq(0, 48, by = .5)
y.pred <- predict(out.lss, data.frame(oexp = x.val))
plot(income ~ oexp, data = incex, cex = .4, xlab = 'Occup. Experience',
ylab = 'Monthly Net Income', main = 'loess (LOcal regrESSion)')
lines(x.val, y.pred, col = 'blue', lwd = 2)
We can add confidence intervals/envelopes for non-parametric, loess regression.
Get standard errors for predicted means and construct confidence intervals.
loess.pred <- predict(out.lss, x.val, se=TRUE)
str(loess.pred)
## List of 4
## $ fit : num [1:97] 1195 1187 1179 1171 1164 ...
## $ se.fit : num [1:97] 18.8 16.2 14.4 13.7 13.9 ...
## $ residual.scale: num 234
## $ df : num 1911
plot(income ~ oexp, data = incex, cex = .4, xlab = 'Occup. Experience',
ylab = 'Monthly Net Income', main = 'loess (LOcal regrESSion)')
lines(x.val, y.pred, col = 'blue', lwd = 2)
lines(x.val, y.pred - qt(0.975, loess.pred$df)*loess.pred$se, lty=2, col="red")
lines(x.val, y.pred + qt(0.975, loess.pred$df)*loess.pred$se, lty=2, col="red")
ggplot.ggplot(incex, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') +
geom_smooth(method='loess', formula= y~x, col="red",
se = TRUE, level = 0.95, fill = "#69b3a2", span=.2) + theme_bw()
SDS data (SCS_QE.sav).scs$voc <- ifelse(scs$vm == 'Vocabulary', 1, 0) # factor to numeric
kable(head(scs[, c("numbmath", "vm", "voc")]))
| numbmath | vm | voc |
|---|---|---|
| 2 | Mathematics | 0 |
| 2 | Mathematics | 0 |
| 1 | Mathematics | 0 |
| 2 | Vocabulary | 1 |
| 2 | Vocabulary | 1 |
| 2 | Vocabulary | 1 |
par(mar=c(4, 4, 0.5, 2)) # bottom, left, top, right
plot(voc ~ jitter(numbmath), data = scs, xlim = c(0, 15), ylim = c(-.1, 1),
xlab = 'Number of math courses', ylab = 'Vocabulary training')
# fit models
out.lm <- lm(voc ~ numbmath, data = scs) # linear reg.
out.smth <- loess(voc ~ numbmath, scs) # loess reg
out.glm <- glm(vm ~ numbmath, family = 'binomial', data = scs) # logistic reg.
par(mar=c(4, 4, 0.5, 2)) # bottom, left, top, right
plot(voc ~ jitter(numbmath), data = scs, xlim = c(0, 15), ylim = c(-.1, 1),
xlab = 'Number of math courses', ylab = 'Vocabulary training')
abline(out.lm, lwd = 2) # add linear reg
lines(seq(0, 10, .1), predict(out.smth, data.frame(numbmath = seq(0, 10, .1))), col="blue", lwd = 2) # add loess
lines(0:15, predict(out.glm, data.frame(numbmath = 0:15), type = 'response'),
lwd = 2, col = 'red', lty = 2) # add logistic reg.
legend('topright', c('Linear-probability model', 'loess', 'Logit model (logistic regression)'),
col = c('black', 'blue', 'red'), lty = c(1, 1, 2), lwd = 2, bty = 'n')
Use ggplot.
To fit logistic regression in package ggplot2, you need to use a numeric outcome vector lying between 0 and 1.
Try with factor vm instead of numeric voc.
ggplot(scs, aes(x=numbmath, y=voc)) +
geom_jitter(height = 0.05) +
geom_smooth(method = "glm", method.args = list(family = "binomial")) + theme_bw()
ggplot(scs, aes(x=numbmath, y=voc)) +
geom_jitter(height = 0.05) +
geom_smooth(method = "glm", formula = y ~ splines::ns(x, 2), method.args = list(family = "binomial")) + theme_bw()